
/**
 * Compute the inclusive prefix sum of the n integers in g_idata and store
 * it in g_odata (g_odata[i] = g_idata[0] + ... + g_idata[i]).
 *
 * The last element of the result (the total sum) is also stored in
 * *totallength.
 *
 * Launch requirements:
 *  - Indexing uses threadIdx.x only, so the whole scan is done by one
 *    thread block; any blockDim.x >= 1 works because each thread walks the
 *    data with a stride of blockDim.x.
 *  - Dynamic shared memory: 2 * n * sizeof(int) bytes (two buffers of n ints).
 *
 * If n <= 0 nothing is read or written, *totallength included.
 *
 * @param g_odata      output array, at least n ints
 * @param g_idata      input array, at least n ints
 * @param n            number of elements to scan
 * @param totallength  receives the total sum (g_odata[n-1])
 */
__global__ void scan(int *g_odata, int *g_idata, int n, int* totallength)
{
	// Dynamically allocated shared memory for scan kernels:
	// temp[0 .. n-1] and temp[n .. 2n-1] are the two halves of a double buffer.
	extern __shared__ int temp[];

	// n is identical for every thread, so this exit is taken by the whole
	// block or by nobody; no thread is left waiting at a barrier.
	if (n <= 0)
		return;

	const int thid = threadIdx.x;
	const int stride = blockDim.x;
	int pout = 0;
	int pin = 1;

	// Cache the computational window in shared memory
	for (int i = thid; i < n; i += stride)
		temp[pout*n + i] = g_idata[i];

	for (int offset = 1; offset < n; offset *= 2)
	{
		// Swap the roles of the two buffers.
		pout = 1 - pout;
		pin = 1 - pin;
		// Reached by every thread of the block (the loop bounds depend only
		// on n). It orders the previous step's writes before this step's
		// reads, and the previous step's reads before this step's writes.
		__syncthreads();
		for (int i = thid; i < n; i += stride)
		{
			int value = temp[pin*n + i];
			if (i >= offset)
				value += temp[pin*n + i - offset];
			temp[pout*n + i] = value;
		}
	}
	// Make the final buffer visible to all threads before it is read back.
	__syncthreads();

	for (int i = thid; i < n; i += stride)
		g_odata[i] = temp[pout*n + i];

	// A single thread publishes the total; thread 0 always exists.
	if (thid == 0)
		(*totallength) = temp[pout*n + n - 1];
}

/**
 * Batched inclusive prefix sum: block b scans the n integers stored at
 * g_idata[b*n .. b*n + n-1] and writes the result to the same positions of
 * g_odata.
 *
 * A block does nothing when blockIdx.x >= batchsize, when
 * invalidformula[blockIdx.x] == 0, or when n <= 0.
 *
 * If the total sum of a processed row is 0, *istrue is set to 1 (an empty
 * formula counts as verified). The kernel never resets *istrue, so the
 * caller must initialise it before the launch.
 *
 * Launch requirements:
 *  - One thread block per row (gridDim.x >= batchsize to cover every row);
 *    any blockDim.x >= 1 works because each thread walks its row with a
 *    stride of blockDim.x.
 *  - Dynamic shared memory: 2 * n * sizeof(int) bytes per block.
 *
 * @param g_odata         output array, batchsize rows of n ints
 * @param g_idata         input array, batchsize rows of n ints
 * @param n               number of elements per row
 * @param istrue          set to 1 when some processed row sums to 0
 * @param batchsize       number of rows
 * @param invalidformula  per-row flag; rows whose entry is 0 are skipped
 */
__global__ void scanclauses(int *g_odata, int *g_idata, int n, int *istrue, int batchsize, int* invalidformula)
{
	//Dynamically allocated shared memory for scan kernels:
	//temp[0 .. n-1] and temp[n .. 2n-1] are the two halves of a double buffer.
	extern __shared__ int temp[];
	const int thid = threadIdx.x;
	const int blockid = blockIdx.x;

	// Every operand here is the same for all threads of a block, so the
	// whole block leaves together and no barrier below is reached by only
	// part of the block. blockid is range-checked before it indexes
	// invalidformula.
	if (n <= 0 || blockid >= batchsize || invalidformula[blockid] == 0)
		return;

	const int stride = blockDim.x;
	// Row offset computed in size_t so blockid * n cannot overflow int.
	const size_t base = (size_t)blockid * (size_t)n;
	int pout = 0;
	int pin = 1;

	// Cache the computational window in shared memory
	for (int i = thid; i < n; i += stride)
		temp[pout*n + i] = g_idata[base + i];

	for (int offset = 1; offset < n; offset *= 2)
	{
		// Swap the roles of the two buffers.
		pout = 1 - pout;
		pin = 1 - pin;
		// Reached by every thread of the block. It orders the previous
		// step's writes before this step's reads, and the previous step's
		// reads before this step's writes.
		__syncthreads();
		for (int i = thid; i < n; i += stride)
		{
			int value = temp[pin*n + i];
			if (i >= offset)
				value += temp[pin*n + i - offset];
			temp[pout*n + i] = value;
		}
	}
	// Make the final buffer visible to all threads before it is read back.
	__syncthreads();

	for (int i = thid; i < n; i += stride)
		g_odata[base + i] = temp[pout*n + i];

	if (thid == 0 && temp[pout*n + n - 1] == 0){
		//if the total sum is null, it is an empty formula, the formula is verified.
		//Several blocks may store here concurrently; they all store the same value.
		*istrue=1;
	}
}
